{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from nltk.sentiment import SentimentAnalyzer\n",
    "import unicodedata\n",
    "from nltk.sentiment.util import *\n",
    "from nltk.sentiment.vader import SentimentIntensityAnalyzer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package vader_lexicon to /home/bf/nltk_data...\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "# Fetch the 'vader_lexicon' resource into the local NLTK data directory;\n",
    "# presumably required by the VADER SentimentIntensityAnalyzer used below.\n",
    "nltk.download('vader_lexicon')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import HDF5 file with all technical data and news"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the combined frame stored under the key \"data\" in the HDF5 file data.h5\n",
    "data = pd.read_hdf(\"data.h5\", \"data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Open</th>\n",
       "      <th>High</th>\n",
       "      <th>Low</th>\n",
       "      <th>Close</th>\n",
       "      <th>Adj Close</th>\n",
       "      <th>Volume</th>\n",
       "      <th>lrets</th>\n",
       "      <th>MACD</th>\n",
       "      <th>stochastics</th>\n",
       "      <th>ATR</th>\n",
       "      <th>News</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1990-01-01</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Warm-Weather Sissies?A Proud Beginning to 1990...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-01-02</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Guest Supply Inc reports earnings for Qtr to S...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-01-03</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>For Judaism's Remnant, Coup Is Mixed BlessingP...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-01-04</th>\n",
       "      <td>358.760010</td>\n",
       "      <td>358.760010</td>\n",
       "      <td>352.890015</td>\n",
       "      <td>355.670013</td>\n",
       "      <td>355.670013</td>\n",
       "      <td>177000000.0</td>\n",
       "      <td>-0.008650</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.869995</td>\n",
       "      <td>Group W Sports GainsCooney's Common Denominato...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-01-05</th>\n",
       "      <td>355.670013</td>\n",
       "      <td>355.670013</td>\n",
       "      <td>351.350006</td>\n",
       "      <td>352.200012</td>\n",
       "      <td>352.200012</td>\n",
       "      <td>158530000.0</td>\n",
       "      <td>-0.009804</td>\n",
       "      <td>-0.036878</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.320007</td>\n",
       "      <td>President Wins Bipartisan Praise For Solution ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  Open        High         Low       Close   Adj Close  \\\n",
       "1990-01-01         NaN         NaN         NaN         NaN         NaN   \n",
       "1990-01-02         NaN         NaN         NaN         NaN         NaN   \n",
       "1990-01-03         NaN         NaN         NaN         NaN         NaN   \n",
       "1990-01-04  358.760010  358.760010  352.890015  355.670013  355.670013   \n",
       "1990-01-05  355.670013  355.670013  351.350006  352.200012  352.200012   \n",
       "\n",
       "                 Volume     lrets      MACD  stochastics       ATR  \\\n",
       "1990-01-01          NaN       NaN       NaN          NaN       NaN   \n",
       "1990-01-02          NaN       NaN       NaN          NaN       NaN   \n",
       "1990-01-03          NaN       NaN       NaN          NaN       NaN   \n",
       "1990-01-04  177000000.0 -0.008650  0.000000          NaN  5.869995   \n",
       "1990-01-05  158530000.0 -0.009804 -0.036878          NaN  4.320007   \n",
       "\n",
       "                                                         News  \n",
       "1990-01-01  Warm-Weather Sissies?A Proud Beginning to 1990...  \n",
       "1990-01-02  Guest Supply Inc reports earnings for Qtr to S...  \n",
       "1990-01-03  For Judaism's Remnant, Coup Is Mixed BlessingP...  \n",
       "1990-01-04  Group W Sports GainsCooney's Common Denominato...  \n",
       "1990-01-05  President Wins Bipartisan Praise For Solution ...  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the loaded frame\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Remove the first day of data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/bf/anaconda3/envs/dasc/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: \n",
      ".ix is deprecated. Please use\n",
      ".loc for label based indexing or\n",
      ".iloc for positional indexing\n",
      "\n",
      "See the documentation here:\n",
      "http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Open</th>\n",
       "      <th>High</th>\n",
       "      <th>Low</th>\n",
       "      <th>Close</th>\n",
       "      <th>Adj Close</th>\n",
       "      <th>Volume</th>\n",
       "      <th>lrets</th>\n",
       "      <th>MACD</th>\n",
       "      <th>stochastics</th>\n",
       "      <th>ATR</th>\n",
       "      <th>News</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1990-01-02</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Guest Supply Inc reports earnings for Qtr to S...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-01-03</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>For Judaism's Remnant, Coup Is Mixed BlessingP...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-01-04</th>\n",
       "      <td>358.760010</td>\n",
       "      <td>358.760010</td>\n",
       "      <td>352.890015</td>\n",
       "      <td>355.670013</td>\n",
       "      <td>355.670013</td>\n",
       "      <td>177000000.0</td>\n",
       "      <td>-0.008650</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.869995</td>\n",
       "      <td>Group W Sports GainsCooney's Common Denominato...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-01-05</th>\n",
       "      <td>355.670013</td>\n",
       "      <td>355.670013</td>\n",
       "      <td>351.350006</td>\n",
       "      <td>352.200012</td>\n",
       "      <td>352.200012</td>\n",
       "      <td>158530000.0</td>\n",
       "      <td>-0.009804</td>\n",
       "      <td>-0.036878</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.320007</td>\n",
       "      <td>President Wins Bipartisan Praise For Solution ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-01-06</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CorrectionsFrom Jewish Eden to EmbarrassmentAw...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  Open        High         Low       Close   Adj Close  \\\n",
       "1990-01-02         NaN         NaN         NaN         NaN         NaN   \n",
       "1990-01-03         NaN         NaN         NaN         NaN         NaN   \n",
       "1990-01-04  358.760010  358.760010  352.890015  355.670013  355.670013   \n",
       "1990-01-05  355.670013  355.670013  351.350006  352.200012  352.200012   \n",
       "1990-01-06         NaN         NaN         NaN         NaN         NaN   \n",
       "\n",
       "                 Volume     lrets      MACD  stochastics       ATR  \\\n",
       "1990-01-02          NaN       NaN       NaN          NaN       NaN   \n",
       "1990-01-03          NaN       NaN       NaN          NaN       NaN   \n",
       "1990-01-04  177000000.0 -0.008650  0.000000          NaN  5.869995   \n",
       "1990-01-05  158530000.0 -0.009804 -0.036878          NaN  4.320007   \n",
       "1990-01-06          NaN       NaN       NaN          NaN       NaN   \n",
       "\n",
       "                                                         News  \n",
       "1990-01-02  Guest Supply Inc reports earnings for Qtr to S...  \n",
       "1990-01-03  For Judaism's Remnant, Coup Is Mixed BlessingP...  \n",
       "1990-01-04  Group W Sports GainsCooney's Common Denominato...  \n",
       "1990-01-05  President Wins Bipartisan Praise For Solution ...  \n",
       "1990-01-06  CorrectionsFrom Jewish Eden to EmbarrassmentAw...  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# .ix was deprecated in pandas 0.20 and removed in 1.0; .loc performs the\n",
    "# same label-based slice, keeping every row from 1990-01-02 onwards\n",
    "# (label slices are inclusive of the start label).\n",
    "data = data.loc[\"1990-01-02\":]\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Remove holidays\n",
    "Feel free to use forward fill to replace NaN values instead."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Open</th>\n",
       "      <th>High</th>\n",
       "      <th>Low</th>\n",
       "      <th>Close</th>\n",
       "      <th>Adj Close</th>\n",
       "      <th>Volume</th>\n",
       "      <th>lrets</th>\n",
       "      <th>MACD</th>\n",
       "      <th>stochastics</th>\n",
       "      <th>ATR</th>\n",
       "      <th>News</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1990-01-30</th>\n",
       "      <td>325.200012</td>\n",
       "      <td>325.730011</td>\n",
       "      <td>319.829987</td>\n",
       "      <td>322.980011</td>\n",
       "      <td>322.980011</td>\n",
       "      <td>186030000.0</td>\n",
       "      <td>-0.006850</td>\n",
       "      <td>-1.512383</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>5.900024</td>\n",
       "      <td>The Moment, Missed;   Mr. Bush on Defense: Too...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-02-01</th>\n",
       "      <td>329.079987</td>\n",
       "      <td>329.859985</td>\n",
       "      <td>327.760010</td>\n",
       "      <td>328.790009</td>\n",
       "      <td>328.790009</td>\n",
       "      <td>154580000.0</td>\n",
       "      <td>-0.000882</td>\n",
       "      <td>-1.185424</td>\n",
       "      <td>18.857509</td>\n",
       "      <td>6.879974</td>\n",
       "      <td>Samuel C. Phillips, Who Directed Apollo Lunar ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-02-05</th>\n",
       "      <td>330.920013</td>\n",
       "      <td>332.160004</td>\n",
       "      <td>330.450012</td>\n",
       "      <td>331.850006</td>\n",
       "      <td>331.850006</td>\n",
       "      <td>130950000.0</td>\n",
       "      <td>0.002806</td>\n",
       "      <td>-0.738707</td>\n",
       "      <td>28.789340</td>\n",
       "      <td>3.369995</td>\n",
       "      <td>Costly Pitfalls in Worker RetrainingMemorial o...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-02-08</th>\n",
       "      <td>333.750000</td>\n",
       "      <td>336.089996</td>\n",
       "      <td>332.000000</td>\n",
       "      <td>332.959991</td>\n",
       "      <td>332.959991</td>\n",
       "      <td>176240000.0</td>\n",
       "      <td>-0.002370</td>\n",
       "      <td>-0.328807</td>\n",
       "      <td>39.060605</td>\n",
       "      <td>4.239990</td>\n",
       "      <td>No End to the ThreatTempest Technologies repor...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-02-09</th>\n",
       "      <td>333.019989</td>\n",
       "      <td>334.600006</td>\n",
       "      <td>332.410004</td>\n",
       "      <td>333.619995</td>\n",
       "      <td>333.619995</td>\n",
       "      <td>146910000.0</td>\n",
       "      <td>0.001980</td>\n",
       "      <td>0.019442</td>\n",
       "      <td>41.643793</td>\n",
       "      <td>2.190002</td>\n",
       "      <td>LAWYER CONVICTED IN ABDUCTION PLOTOil Tanker i...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  Open        High         Low       Close   Adj Close  \\\n",
       "1990-01-30  325.200012  325.730011  319.829987  322.980011  322.980011   \n",
       "1990-02-01  329.079987  329.859985  327.760010  328.790009  328.790009   \n",
       "1990-02-05  330.920013  332.160004  330.450012  331.850006  331.850006   \n",
       "1990-02-08  333.750000  336.089996  332.000000  332.959991  332.959991   \n",
       "1990-02-09  333.019989  334.600006  332.410004  333.619995  333.619995   \n",
       "\n",
       "                 Volume     lrets      MACD  stochastics       ATR  \\\n",
       "1990-01-30  186030000.0 -0.006850 -1.512383     0.000000  5.900024   \n",
       "1990-02-01  154580000.0 -0.000882 -1.185424    18.857509  6.879974   \n",
       "1990-02-05  130950000.0  0.002806 -0.738707    28.789340  3.369995   \n",
       "1990-02-08  176240000.0 -0.002370 -0.328807    39.060605  4.239990   \n",
       "1990-02-09  146910000.0  0.001980  0.019442    41.643793  2.190002   \n",
       "\n",
       "                                                         News  \n",
       "1990-01-30  The Moment, Missed;   Mr. Bush on Defense: Too...  \n",
       "1990-02-01  Samuel C. Phillips, Who Directed Apollo Lunar ...  \n",
       "1990-02-05  Costly Pitfalls in Worker RetrainingMemorial o...  \n",
       "1990-02-08  No End to the ThreatTempest Technologies repor...  \n",
       "1990-02-09  LAWYER CONVICTED IN ABDUCTION PLOTOil Tanker i...  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Discard, in place, every row that has at least one missing value\n",
    "data.dropna(axis='index', how='any', inplace=True)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Remove the Open, High, Low and Close columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Adj Close</th>\n",
       "      <th>Volume</th>\n",
       "      <th>lrets</th>\n",
       "      <th>MACD</th>\n",
       "      <th>stochastics</th>\n",
       "      <th>ATR</th>\n",
       "      <th>News</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1990-01-30</th>\n",
       "      <td>322.980011</td>\n",
       "      <td>186030000.0</td>\n",
       "      <td>-0.006850</td>\n",
       "      <td>-1.512383</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>5.900024</td>\n",
       "      <td>The Moment, Missed;   Mr. Bush on Defense: Too...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-02-01</th>\n",
       "      <td>328.790009</td>\n",
       "      <td>154580000.0</td>\n",
       "      <td>-0.000882</td>\n",
       "      <td>-1.185424</td>\n",
       "      <td>18.857509</td>\n",
       "      <td>6.879974</td>\n",
       "      <td>Samuel C. Phillips, Who Directed Apollo Lunar ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-02-05</th>\n",
       "      <td>331.850006</td>\n",
       "      <td>130950000.0</td>\n",
       "      <td>0.002806</td>\n",
       "      <td>-0.738707</td>\n",
       "      <td>28.789340</td>\n",
       "      <td>3.369995</td>\n",
       "      <td>Costly Pitfalls in Worker RetrainingMemorial o...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-02-08</th>\n",
       "      <td>332.959991</td>\n",
       "      <td>176240000.0</td>\n",
       "      <td>-0.002370</td>\n",
       "      <td>-0.328807</td>\n",
       "      <td>39.060605</td>\n",
       "      <td>4.239990</td>\n",
       "      <td>No End to the ThreatTempest Technologies repor...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1990-02-09</th>\n",
       "      <td>333.619995</td>\n",
       "      <td>146910000.0</td>\n",
       "      <td>0.001980</td>\n",
       "      <td>0.019442</td>\n",
       "      <td>41.643793</td>\n",
       "      <td>2.190002</td>\n",
       "      <td>LAWYER CONVICTED IN ABDUCTION PLOTOil Tanker i...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Adj Close       Volume     lrets      MACD  stochastics  \\\n",
       "1990-01-30  322.980011  186030000.0 -0.006850 -1.512383     0.000000   \n",
       "1990-02-01  328.790009  154580000.0 -0.000882 -1.185424    18.857509   \n",
       "1990-02-05  331.850006  130950000.0  0.002806 -0.738707    28.789340   \n",
       "1990-02-08  332.959991  176240000.0 -0.002370 -0.328807    39.060605   \n",
       "1990-02-09  333.619995  146910000.0  0.001980  0.019442    41.643793   \n",
       "\n",
       "                 ATR                                               News  \n",
       "1990-01-30  5.900024  The Moment, Missed;   Mr. Bush on Defense: Too...  \n",
       "1990-02-01  6.879974  Samuel C. Phillips, Who Directed Apollo Lunar ...  \n",
       "1990-02-05  3.369995  Costly Pitfalls in Worker RetrainingMemorial o...  \n",
       "1990-02-08  4.239990  No End to the ThreatTempest Technologies repor...  \n",
       "1990-02-09  2.190002  LAWYER CONVICTED IN ABDUCTION PLOTOil Tanker i...  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop the four raw price columns in place; the remaining columns are shown below\n",
    "data.drop(columns=['Open', 'High', 'Low', 'Close'], inplace=True)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using NLTK sentiment analyzer to generate a polarity score\n",
    "\n",
    "Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.\n",
    "\n",
    "https://www.nltk.org/_modules/nltk/sentiment/sentiment_analyzer.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def sentimentanalysis(df):\n",
    "    \"\"\"Score each row's 'News' text with VADER and store the result on ``df``.\n",
    "\n",
    "    For every index label the 'News' value is NFKD-normalised, passed to\n",
    "    ``SentimentIntensityAnalyzer.polarity_scores`` and the 'neg', 'neu' and\n",
    "    'pos' entries of the returned dict are written to same-named columns.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with a 'News' column. It is modified in place.\n",
    "\n",
    "    Returns:\n",
    "        The same ``df`` object, now carrying the score columns.\n",
    "\n",
    "    Rows for which a TypeError is raised (e.g. the 'News' value is not a\n",
    "    str) are reported on stdout and left without scores.\n",
    "    \"\"\"\n",
    "    sid = SentimentIntensityAnalyzer()\n",
    "    for date in df.index:\n",
    "        try:\n",
    "            sentence = unicodedata.normalize('NFKD', df.loc[date, 'News'])\n",
    "            ss = sid.polarity_scores(sentence)\n",
    "            # .at is an indexer, not a method: it must be subscripted with\n",
    "            # [row, column]. Writing .at(...) on the left of '=' is a\n",
    "            # SyntaxError that prevents the whole cell from compiling.\n",
    "            df.at[date, 'neg'] = ss['neg']\n",
    "            df.at[date, 'neu'] = ss['neu']\n",
    "            df.at[date, 'pos'] = ss['pos']\n",
    "        except TypeError:\n",
    "            # Best effort: show the offending value and its label, then keep going.\n",
    "            print(df.loc[date, 'News'])\n",
    "            print(date)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/bf/anaconda3/envs/dasc/lib/python3.6/site-packages/ipykernel_launcher.py:7: FutureWarning: set_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead\n",
      "  import sys\n",
      "/home/bf/anaconda3/envs/dasc/lib/python3.6/site-packages/ipykernel_launcher.py:8: FutureWarning: set_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead\n",
      "  \n",
      "/home/bf/anaconda3/envs/dasc/lib/python3.6/site-packages/ipykernel_launcher.py:9: FutureWarning: set_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead\n",
      "  if __name__ == '__main__':\n"
     ]
    }
   ],
   "source": [
    "# sentimentanalysis() returns the very object it was given, so new_data and\n",
    "# data are two names for one DataFrame.\n",
    "new_data = sentimentanalysis(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Drop the raw 'News' text column in place\n",
    "new_data.drop(columns=['News'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Persist the frame to the HDF5 file 'data2' under the key 'new_data'\n",
    "new_data.to_hdf('data2', key='new_data')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Check if there is missing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Adj Close      False\n",
       "Volume         False\n",
       "lrets          False\n",
       "MACD           False\n",
       "stochastics    False\n",
       "ATR            False\n",
       "neg            False\n",
       "neu            False\n",
       "pos            False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per column: True if it contains at least one missing value\n",
    "new_data.isnull().any(axis=0)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
